# Loading Regex library
library(qdapRegex)

Attaching package: ‘qdapRegex’

The following object is masked from ‘package:dplyr’:

    explain

The following object is masked from ‘package:ggplot2’:

    %+%
# Extract tweet text from climate dataset
twt_txt <- climate_twts$text
head(twt_txt)
[1] "I don't think it's too much of an exaggeration to say everyone's fate on the planet probably hinges on the dems winning these 2 Georgia Senate seats the science is clear https://t.co/qmFkKqOwC2 https://t.co/M04ZS22LB0"                                                               
[2] "BOM and CSIRO State of the Climate 2020 shows Australia is experiencing climate change now\nhttps://t.co/HdxntH6ziX"                                                                                                                                                                     
[3] "\"[this] “offensive inquisitiveness” whose goal of humiliating others [is] inherently objectionable... “gossip derived from malicious judgment of others” could only “cast a shadow of worthlessness” over humanity + climate of rage ... inimical to civil peace and social progress.\""
[4] "Trashing Labor on Climate policy,\nHas Michelle looked at the government??\n\nApparently it’s ok to question China on everything but it’s not ok to question thing’s attacks on US democracy.\n\nhttps://t.co/Djschiqc9g"                                                                
[5] "@BigDuke6__ @djmirk @CytometerMan @DPWIMM @realDonaldTrump @ChanelRion @OANN Funny, I thought that was climate change??"                                                                                                                                                                 
[6] "@timinmitcham Expect thousands of climate change refugees to seek asylum!"                                                                                                                                                                                                               
# Remove URLs from the tweet text
twt_txt_url <- rm_twitter_url(twt_txt)

# Replace special characters, punctuation, & numbers with spaces
twt_txt_chrs  <- gsub("[^A-Za-z]"," " , twt_txt_url)

# Loading text mining library
library(tm)
Loading required package: NLP

Attaching package: ‘NLP’

The following object is masked from ‘package:ggplot2’:

    annotate
# Convert the cleaned text in "twt_txt_chrs" to a text corpus
twt_corpus <- twt_txt_chrs %>% 
                VectorSource() %>% 
                Corpus() 

# Convert the corpus to lowercase
twt_corpus_lwr <- tm_map(twt_corpus, tolower) 
transformation drops documents
# Remove English stop words from the corpus using SMART dictionary and view the corpus
twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("smart"))
transformation drops documents
head(twt_corpus_stpwd$content)
[1] " don         exaggeration     fate   planet  hinges   dems winning    georgia senate seats  science  clear"                                                                                                                      
[2] "bom  csiro state   climate      shows australia  experiencing climate change "                                                                                                                                                   
[3] "     offensive inquisitiveness   goal  humiliating     inherently objectionable     gossip derived  malicious judgment       cast  shadow  worthlessness   humanity   climate  rage     inimical  civil peace  social progress  "
[4] "trashing labor  climate policy   michelle looked   government   apparently     question china         question thing  attacks   democracy "                                                                                      
[5] " bigduke     djmirk  cytometerman  dpwimm  realdonaldtrump  chanelrion  oann funny   thought   climate change  "                                                                                                                 
[6] " timinmitcham expect thousands  climate change refugees  seek asylum "                                                                                                                                                           
# Remove additional spaces from the corpus
twt_corpus_spaces <- tm_map(twt_corpus_stpwd, stripWhitespace)
transformation drops documents
# Loading library for text analysis
library(qdap)
Loading required package: qdapDictionaries
Loading required package: qdapTools
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

Attaching package: ‘qdapTools’

The following object is masked from ‘package:dplyr’:

    id

Loading required package: RColorBrewer

Attaching package: ‘qdap’

The following objects are masked from ‘package:tm’:

    as.DocumentTermMatrix, as.TermDocumentMatrix

The following object is masked from ‘package:NLP’:

    ngrams

The following object is masked from ‘package:rtweet’:

    %>%

The following object is masked from ‘package:forcats’:

    %>%

The following object is masked from ‘package:stringr’:

    %>%

The following object is masked from ‘package:dplyr’:

    %>%

The following object is masked from ‘package:purrr’:

    %>%

The following object is masked from ‘package:tidyr’:

    %>%

The following object is masked from ‘package:tibble’:

    %>%

The following objects are masked from ‘package:base’:

    Filter, proportions
# Extract term frequencies for top 60 words and view output
termfreq  <-  freq_terms(twt_corpus_spaces, 60)
termfreq
# Create a vector of custom stop words
custom_stopwds <- c("amp", "ve", "don", "lo", "climate", "change")

# Remove custom stop words and create a refined corpus
corp_refined <- tm_map(twt_corpus_spaces, removeWords, custom_stopwds) 
transformation drops documents
# Extract term frequencies for the top 25 words
termfreq_25w <- freq_terms(corp_refined, 25)

# Identify terms with more than 30 counts from the top 25 list
term30 <- subset(termfreq_25w, FREQ > 30)


# Barchart
term30 %>% 
ggplot() +
aes(x = reorder(WORD, -FREQ), y = FREQ) +
        geom_bar(stat = "identity", fill = "blue") + 
        theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create word cloud with 10 colors and max 30 words
wordcloud(corp_refined, max.words = 30, 
    colors = brewer.pal(10, "Dark2"), 
    scale=c(4,1), random.order = FALSE)
n too large, allowed maximum for palette Dark2 is 8
Returning the palette you asked for with that many colors


# Load libraries
library(topicmodels)


# Create a document term matrix (DTM) for *climate*
dtm_climate <- DocumentTermMatrix(corp_refined)

# Find the sum of word counts in each document
rowTotals <- apply(dtm_climate, 1, sum)

# Select rows with a row total greater than zero
dtm_climate_new <- dtm_climate[rowTotals > 0, ]

# Create a topic model with 10 topics
topicmodl_10 <- LDA(dtm_climate_new, k = 10)

# Select and view the top 10 terms in the topic model
top_10terms <- terms(topicmodl_10, 10)
top_10terms 
      Topic 1      Topic 2   Topic 3   Topic 4     Topic 5       Topic 6     Topic 7    
 [1,] "trump"      "people"  "action"  "global"    "time"        "global"    "work"     
 [2,] "crisis"     "biden"   "make"    "biden"     "covid"       "biden"     "people"   
 [3,] "biden"      "action"  "global"  "action"    "environment" "energy"    "biden"    
 [4,] "snow"       "science" "science" "oil"       "green"       "years"     "energy"   
 [5,] "action"     "energy"  "biden"   "world"     "snow"        "today"     "policy"   
 [6,] "nov"        "covid"   "time"    "australia" "world"       "emissions" "world"    
 [7,] "oil"        "health"  "live"    "crisis"    "great"       "people"    "political"
 [8,] "years"      "make"    "crisis"  "big"       "planet"      "crisis"    "crisis"   
 [9,] "government" "report"  "years"   "report"    "global"      "science"   "good"     
[10,] "green"      "years"   "good"    "science"   "pandemic"    "trump"     "future"   
      Topic 8      Topic 9   Topic 10   
 [1,] "biden"      "people"  "people"   
 [2,] "trump"      "world"   "world"    
 [3,] "action"     "csiro"   "biden"    
 [4,] "covid"      "global"  "justice"  
 [5,] "future"     "control" "make"     
 [6,] "world"      "time"    "crisis"   
 [7,] "current"    "great"   "current"  
 [8,] "year"       "lost"    "future"   
 [9,] "government" "year"    "emissions"
[10,] "carbon"     "state"   "health"   
library(syuzhet)

Attaching package: ‘syuzhet’

The following object is masked from ‘package:rtweet’:

    get_tokens
# Perform sentiment analysis for tweets on `ClimateCrisis` 
sa.value <- get_nrc_sentiment(climate_twts$text)
`filter_()` is deprecated as of dplyr 0.7.0.
Please use `filter()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.`group_by_()` is deprecated as of dplyr 0.7.0.
Please use `group_by()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.`data_frame()` is deprecated as of tibble 1.1.0.
Please use `tibble()` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
# View the sentiment scores
head(sa.value, 10)
# Calculate sum of sentiment scores
score <- colSums(sa.value[,])

# Convert the sum of scores to a data frame
score_df <- data.frame(score)

# Convert row names into 'sentiment' column and combine with sentiment scores
score_df2 <- cbind(sentiment = row.names(score_df),  
                  score_df, row.names = NULL)
print(score_df2)

# Plot the sentiment scores
ggplot(data = score_df2, aes(x = sentiment, y = score, fill = sentiment)) +
     geom_bar(stat = "identity") +
       theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Compute the in-degree scores from the reply network
in_degree <- degree(nw_rply, mode = c("in"))

# Sort the in-degree scores in decreasing order
in_degree_sort <- sort(in_degree, decreasing = TRUE)

# View users with the top 10 in-degree scores
in_degree_sort[1:10]
   BorisJohnson    senatemajldr          afneil    KamalaHarris     seanhannity          mcuban 
            179              74              70              63              57              51 
            ozm realDonaldTrump        JoeBiden       zerohedge 
             43              41              30              24 
# Calculate the betweenness scores from the reply network
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
betwn_nw <- betweenness(nw_rply, directed = TRUE)

# Sort betweenness scores in decreasing order and round the values
betwn_nw_sort <- betwn_nw %>%
                    sort(decreasing = TRUE) %>%
                    round()

# View users with the top 10 betweenness scores 
betwn_nw_sort[1:10]
   XRebellionUK    bjames280961 RebeccaElisabe3   richardabetts    Climatehope2        erinbiba 
            203             184              68              39              29              21 
 emilyhewertson        hausfath  T0myBarrient0s  JesseLReynolds 
             20              19              18              14 

library(maps)


# Extract geo-coordinates data to append as new columns
cc_coord <- lat_lng(climate_twts)

# Omit rows with missing geo-coordinates in the data frame
cc_geo <- na.omit(cc_coord[,c("lat", "lng")])

# Plot longitude and latitude values of tweets on UK
map(database = "world", region = "UK(?!r)", fill = TRUE, col = "light green")
with(cc_geo, points(lng, lat, pch = 20, cex = 1, col = 'blue'))


# Plot longitude and latitude values of tweets on the world map
map(database = "world", fill = TRUE, col = "light green")
with(cc_geo, points(lng, lat, pch = 20, cex = 1, col = 'blue'))

---
title: "R Notebook"
output: html_notebook
---

```{r message=FALSE}
# Load libraries
library(tidyverse)
library(httpuv)
library(rtweet)
library(readr)
library(here)
library(rjson)
```

```{r}

# Load the previously saved climate tweets from raw_data/climate_twts.csv.
# The object has to be called `climate_twts`: every later chunk reads the
# tweets under that name (the earlier spelling `cimate_twts` left it undefined).
climate_twts <- read_twitter_csv(here("raw_data/climate_twts.csv"))

```

```{r}
# Time-series plot of the tweets, aggregated by hour and drawn in blue
climate_twts %>%
  ts_plot(by = "hours", color = "blue")
```

```{r}
# Loading Regex library
library(qdapRegex)

# Extract tweet text from climate dataset
twt_txt <- climate_twts$text
head(twt_txt)

# Remove URLs from the tweet text
twt_txt_url <- rm_twitter_url(twt_txt)

# Replace every character that is not an ASCII letter (punctuation, digits,
# symbols, line breaks) with a space
twt_txt_chrs <- gsub("[^A-Za-z]", " ", twt_txt_url)

# Loading text mining library
library(tm)

# Convert the cleaned text in `twt_txt_chrs` to a text corpus
twt_corpus <- twt_txt_chrs %>%
  VectorSource() %>%
  Corpus()

# Convert the corpus to lowercase. `tolower` is a plain character function,
# so it is wrapped in content_transformer(), which is tm's way of turning such
# a function into a corpus transformation that keeps the corpus structure.
twt_corpus_lwr <- tm_map(twt_corpus, content_transformer(tolower))

# Remove English stop words from the corpus using SMART dictionary and view the corpus
twt_corpus_stpwd <- tm_map(twt_corpus_lwr, removeWords, stopwords("smart"))
head(twt_corpus_stpwd$content)

# Collapse the runs of spaces left behind by the removals above
twt_corpus_spaces <- tm_map(twt_corpus_stpwd, stripWhitespace)

# Loading library for text analysis
library(qdap)

# Extract term frequencies for top 60 words and view output
termfreq <- freq_terms(twt_corpus_spaces, 60)
termfreq
```

```{r}
# Words that dominate the corpus without adding information: leftovers of
# contractions / HTML entities plus the search terms themselves
custom_stopwds <- c("amp", "ve", "don", "lo", "climate", "change")

# Strip those words to obtain the refined corpus used by the later chunks
corp_refined <- tm_map(twt_corpus_spaces, removeWords, custom_stopwds)

# Frequency table restricted to the 25 most common terms
termfreq_25w <- freq_terms(corp_refined, top = 25)

# Of those, keep only the terms that occur more than 30 times
term30 <- subset(termfreq_25w, FREQ > 30)

# Bar chart of the retained terms, most frequent first, with slanted labels
ggplot(data = term30, mapping = aes(x = reorder(WORD, -FREQ), y = FREQ)) +
  geom_col(fill = "blue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r}
library(RColorBrewer)
library(wordcloud)

# Create word cloud with max 30 words, coloured with the Dark2 palette.
# Dark2 defines at most 8 colours; requesting 10 only produced the warning
# "n too large, allowed maximum for palette Dark2 is 8" and returned those
# same 8 colours, so ask for 8 explicitly.
wordcloud(corp_refined, max.words = 30,
    colors = brewer.pal(8, "Dark2"),
    scale = c(4, 1), random.order = FALSE)
```

```{r}

# Load libraries
library(topicmodels)


# Create a document term matrix (DTM) for *climate*
dtm_climate <- DocumentTermMatrix(corp_refined)

# Find the sum of word counts in each document
rowTotals <- apply(dtm_climate, 1, sum)

# Select rows with a row total greater than zero
# (documents left empty by the cleaning steps cannot be passed to LDA)
dtm_climate_new <- dtm_climate[rowTotals > 0, ]

# Create a topic model with 10 topics.
# The seed is fixed through `control` so the fitted topics are the same on
# every run of the notebook; without it the topics change between runs.
topicmodl_10 <- LDA(dtm_climate_new, k = 10, control = list(seed = 1234L))

# Select and view the top 10 terms in the topic model
top_10terms <- terms(topicmodl_10, 10)
top_10terms


```

```{r}
library(syuzhet)

# Score the text of every climate tweet with get_nrc_sentiment()
sa.value <- get_nrc_sentiment(climate_twts[["text"]])

# Preview the scores of the first ten tweets
head(sa.value, n = 10)
```

```{r}
# Total each sentiment column over all tweets
score <- colSums(sa.value)

# One-column data frame of the totals; the sentiment names become row names
score_df <- data.frame(score = score)

# Move the row names into an explicit 'sentiment' column next to the totals
score_df2 <- data.frame(
  sentiment = row.names(score_df),
  score_df,
  row.names = NULL,
  check.names = FALSE
)
print(score_df2)

# Bar chart of the totals, one coloured bar per sentiment, slanted labels
ggplot(score_df2, aes(x = sentiment, y = score, fill = sentiment)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r message=FALSE}
library(igraph)

# Extract source vertex (author) and target vertex (user replied to)
# from the tweet data frame
rply_df <- climate_twts[, c("screen_name", "reply_to_screen_name")]

# Remove rows with missing values, i.e. tweets that are not replies
rply_df_new <- rply_df[complete.cases(rply_df), ]

# Create a two-column edge-list matrix
rply_matrx <- as.matrix(rply_df_new)

# Convert the matrix to a directed reply network (author -> user replied to)
nw_rply <- graph_from_edgelist(el = rply_matrx, directed = TRUE)

# Calculate out-degree scores from the reply network
out_degree <- degree(nw_rply, mode = "out")

# Sort the out-degree scores in decreasing order
out_degree_sort <- sort(out_degree, decreasing = TRUE)

# View users with the top 20 out-degree scores.
# head() returns fewer entries when the network has under 20 vertices,
# whereas `[1:20]` would pad the result with NA.
head(out_degree_sort, 20)
```

```{r}
# Compute the in-degree scores from the reply network
in_degree <- degree(nw_rply, mode = "in")

# Sort the in-degree scores in decreasing order
in_degree_sort <- sort(in_degree, decreasing = TRUE)

# View users with the top 10 in-degree scores.
# head() returns fewer entries when the network has under 10 vertices,
# whereas `[1:10]` would pad the result with NA.
head(in_degree_sort, 10)
```

```{r}
# Calculate the betweenness scores from the reply network
betwn_nw <- betweenness(nw_rply, directed = TRUE)

# Sort betweenness scores in decreasing order and round the values
betwn_nw_sort <- betwn_nw %>%
  sort(decreasing = TRUE) %>%
  round()

# View users with the top 10 betweenness scores.
# head() returns fewer entries when the network has under 10 vertices,
# whereas `[1:10]` would pad the result with NA.
head(betwn_nw_sort, 10)
```

```{r}
# Create a variable for out-degree
deg_out <- degree(nw_rply, mode = "out")
deg_out

# Amplify the out-degree values
# NOTE(review): vert_size is not used by the plot below, which sizes vertices
# by the raw out-degree; kept so the object remains available.
vert_size <- (deg_out * 3)
# + 5

# Users with more than 1,000,000 followers
user_cos <- users_data(climate_twts) %>%
    filter(followers_count > 1000000)

# Flag column on the high-follower table; after the filter above it is "1"
# on every row and is kept only so the column remains available
user_cos$follow <- ifelse(user_cos$followers_count > 1000000, "1", "0")

# Assign a follower flag to every vertex of the reply network, matching by
# screen name: "1" when the vertex is one of the high-follower users, "0"
# otherwise. A positional assignment of `user_cos$follow` would not line up
# with the vertices, because the table has neither the same length nor the
# same order as the vertex set.
# (assumes users_data() exposes a `screen_name` column, as the tweet data does)
V(nw_rply)$followers <- ifelse(V(nw_rply)$name %in% user_cos$screen_name, "1", "0")
vertex_attr(nw_rply)

# Set the vertex colors based on follower count and create a plot:
# "0" -> light blue, "1" -> light pink. The factor levels are fixed so the
# mapping stays correct even when only one of the two flags occurs.
sub_color <- c("light blue", "light pink")
follower_flag <- factor(vertex_attr(nw_rply, "followers"), levels = c("0", "1"))

plot(nw_rply,
     # asp = 30/30,
     vertex.size = deg_out, edge.arrow.size = 0.01,
     vertex.label.cex = 0.01,
     vertex.color = sub_color[follower_flag],
     vertex.label.color = "black", vertex.frame.color = "light grey")
```

```{r}

library(maps)

# Append latitude / longitude columns derived from the tweets' geo data
cc_coord <- lat_lng(climate_twts)

# Keep only the coordinate columns and drop tweets without a location
cc_geo <- na.omit(cc_coord[, c("lat", "lng")])

# UK map with one blue dot per geo-tagged tweet
# (the pattern "UK(?!r)" matches UK regions but not names starting "Ukr")
map(database = "world", region = "UK(?!r)", fill = TRUE, col = "light green")
points(x = cc_geo$lng, y = cc_geo$lat, pch = 20, cex = 1, col = "blue")

# World map with the same tweet locations overlaid
map(database = "world", fill = TRUE, col = "light green")
points(x = cc_geo$lng, y = cc_geo$lat, pch = 20, cex = 1, col = "blue")
```

